library(peakRAM)
Warning: package ‘peakRAM’ was built under R version 4.2.3
memused<-peakRAM({
library("tidyverse")
library("e1071")
library("caret")
library("ggplot2")
library("reshape2")
library("plotly")
library("pryr")
library(randomForest)
})
── Attaching core tidyverse packages ────────────────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.0     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.1     ✔ tibble    3.1.8
✔ lubridate 1.9.2     ✔ tidyr     1.3.0
✔ purrr     1.0.1     ── Conflicts ──────────────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errorsWarning: package ‘e1071’ was built under R version 4.2.3Warning: package ‘caret’ was built under R version 4.2.3Loading required package: lattice
Registered S3 method overwritten by 'data.table':
  method           from
  print.data.table     

Attaching package: ‘caret’

The following object is masked from ‘package:purrr’:

    lift

Warning: package ‘reshape2’ was built under R version 4.2.3
Attaching package: ‘reshape2’

The following object is masked from ‘package:tidyr’:

    smiths

Warning: package ‘plotly’ was built under R version 4.2.3Registered S3 method overwritten by 'htmlwidgets':
  method           from         
  print.htmlwidget tools:rstudio

Attaching package: ‘plotly’

The following object is masked from ‘package:ggplot2’:

    last_plot

The following object is masked from ‘package:stats’:

    filter

The following object is masked from ‘package:graphics’:

    layout

Warning: package ‘pryr’ was built under R version 4.2.3
Attaching package: ‘pryr’

The following object is masked from ‘package:dplyr’:

    where

The following objects are masked from ‘package:purrr’:

    compose, partial

Warning: package ‘randomForest’ was built under R version 4.2.3randomForest 4.7-1.1
Type rfNews() to see new features/changes/bug fixes.

Attaching package: ‘randomForest’

The following object is masked from ‘package:dplyr’:

    combine

The following object is masked from ‘package:ggplot2’:

    margin
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 2.58 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 43 MiB
memused<-peakRAM({
car<-read.csv("../ca-dealers-used.csv")
})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 4.54 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 172.4 MiB
# DATA PREPROCESSING
# 1. FEATURE SELECTION
# 2. REMOVING MISSING AND NAN VALUES
# 3. CONVERTING COLUMNS WITH STRING VALUES INTO CATEGORICAL NUMERIC VALUES

memused<-peakRAM({

car<-subset(car,select=c(miles, year, make, model,trim,body_type, vehicle_type, drivetrain, transmission, fuel_type,engine_size, city,price))
car_viz<-car
#' Drop incomplete or non-finite rows and coerce every column to numeric.
#'
#' Rows with `NA`/`NaN` in any column are removed first, then rows with
#' `Inf`/`-Inf` in any numeric column. The infinite check runs column by
#' column instead of `apply()`-ing over rows: `apply()` converts the data
#' frame to a matrix, and once a single column is non-numeric that matrix is
#' character, so `is.infinite()` can no longer detect infinite values.
#'
#' @param df A data frame.
#' @return A data frame holding the surviving rows, with every column passed
#'   through `as.numeric()`.
clean_dataset <- function(df) {
  stopifnot(is.data.frame(df))
  df <- na.omit(df) # Drop rows with missing values (NA and NaN)

  # Accumulate, one column at a time, which rows contain Inf or -Inf.
  has_infinite <- rep(FALSE, nrow(df))
  for (values in df) {
    if (is.numeric(values)) {
      has_infinite <- has_infinite | is.infinite(values)
    }
  }
  df <- df[!has_infinite, , drop = FALSE]

  # Convert the remaining columns to numeric type
  as.data.frame(lapply(df, as.numeric))
}

# Label-encode the text columns: every character column of `car` is replaced
# by the numeric codes of its factor levels. Non-character columns are left
# untouched.
text_cols <- names(car)[vapply(car, is.character, logical(1))]
for (col in text_cols) {
  car[[col]] <- as.numeric(factor(car[[col]]))
}

car_training<-car[,1:13]

car_training2<-clean_dataset(car_training) 

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 1.76 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 150.3 MiB
write.csv(car_training2, file = "my_data.csv")
#BASIC EDA

# Histogram of listing prices, requesting 1000 breaks.
hist(car_viz$price, breaks = 1000, main = "Price", xlab = "Price range", ylab = "Count")

# Histogram of mileage; labelled as miles instead of reusing the price labels.
hist(car_viz$miles, main = "Miles", xlab = "Miles range", ylab = "Count")

# Visualizing the data: count of listings per make, broken down by body type
start_time <- Sys.time()
mem_start <- mem_used()

ggplot(car_viz, aes(x = make)) +
  geom_bar(aes(fill = body_type)) +
  labs(x = "Make", y = "Count of Body Type", title = "Make vs Body Type") +
  theme(plot.title = element_text(size = 18),axis.text.x = element_text(angle = 65,  hjust = 1),legend.key.size = unit(0.2,'cm'),legend.text = element_text(size = 5), legend.title = element_text(size = 5),
        legend.box = "vertical")


end_time <- Sys.time()
mem_end <- mem_used()
cpu_time <- end_time - start_time
mem_used_chunk <- mem_end - mem_start
cat("CPU time usage of code chunk:",cpu_time,"\n")
CPU time usage of code chunk: 1.617222 
cat("Memory usage of code chunk:", mem_used_chunk/1000000, "MiB\n")
Memory usage of code chunk: 15.14197 MiB
start_time <- Sys.time()
mem_start <- mem_used()

plotinter<-ggplot(car_viz, aes(x = make)) +
  geom_bar(aes(fill = body_type)) +
  labs(x = "Make", y = "Count of Body Type", title = "Make vs Body Type") +
  theme(plot.title = element_text(size = 18),axis.text.x = element_text(angle = 65,  hjust = 1),legend.key.size = unit(0.2,'cm'),legend.text = element_text(size = 5), legend.title = element_text(size = 5),
        legend.box = "vertical")
ggplotly(plotinter)

end_time <- Sys.time()
mem_end <- mem_used()
cpu_time <- end_time - start_time
mem_used_chunk <- mem_end - mem_start
cat("CPU time usage of code chunk:",cpu_time,"\n")
CPU time usage of code chunk: 1.542377 
cat("Memory usage of code chunk:", mem_used_chunk/1048576, "MB\n")
Memory usage of code chunk: 18.65584 MB
#ML model TEST TRAIN SPLIT


memused<-peakRAM({
X<-car_training2[,1:12]
Y<-car_training2[,13]
set.seed(42)
splitIndex <- createDataPartition(Y, p = 0.7, list = FALSE, times = 1)
x_train <- X[splitIndex, ]
x_test <- X[-splitIndex, ]
y_train <- Y[splitIndex]
y_test <- Y[-splitIndex]

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 0.16 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 114.1 MiB
#ML Linear Regression Training and testing

memused<-peakRAM({

lm_model <- train(x_train, y_train, method = "lm")
y_pred <- predict(lm_model, x_test)

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 23.48 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 421.4 MiB
#Calculating metrics
mse <- mean((y_pred - y_test)^2)
mae <- mean(abs(y_pred - y_test))
rmse <- sqrt(mse)

cat("Linear Regression MSE: ", mse,"\n")
Linear Regression MSE:  216112618 
cat("Linear Regression MAE: ", mae,"\n")
Linear Regression MAE:  7118.532 
cat("Linear Regression RMSE: ", rmse,"\n")
Linear Regression RMSE:  14700.77 
object.size(lm_model)
143024064 bytes
# KNN Model
memused<-peakRAM({

knnmodel = knnreg(x_train, y_train)
y_pred <- predict(knnmodel,x_test)

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
CPU time usage of code chunk: 125.82 
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
Memory usage of code chunk: 98.1 MiB
#Calculating metrics
mse <- mean((y_pred - y_test)^2)
mae <- mean(abs(y_pred - y_test))
rmse <- sqrt(mse)

cat("KNN MSE: ", mse,"\n")
KNN MSE:  279466774 
cat("KNN MAE: ", mae,"\n")
KNN MAE:  9075.661 
cat("KNN RMSE: ", rmse,"\n")
KNN RMSE:  16717.26 
#Random Forest

memused<-peakRAM({

#rf_model <- train(x_train, y_train,method="rf")# Use the model to predict on X_test
#y_pred <- predict(rf_model, x_test)# Evaluate the model performance


rf_model = randomForest(x = x_train,
                             y = y_train,ntree=50)
  
# Predicting the Test set results
y_pred = predict(rf_model, newdata = x_test)

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")

#Calculating metrics
mse <- mean((y_pred - y_test)^2)
mae <- mean(abs(y_pred - y_test))
rmse <- sqrt(mse)

cat("Random Forest MSE: ", mse,"\n")
cat("Random Forest MAE: ", mae,"\n")
cat("Random Forest RMSE: ", rmse,"\n")
---
title: "R Notebook training ML models and calculating efficiencies in R"
output: html_notebook
---

```{r}
library(peakRAM)
memused<-peakRAM({
library("tidyverse")
library("e1071")
library("caret")
library("ggplot2")
library("reshape2")
library("plotly")
library("pryr")
library(randomForest)
})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")
```


```{r}
# Read the raw listings CSV into `car`, recording the elapsed time and peak
# RAM that peakRAM reports for the read.
memused <- peakRAM({
  car <- read.csv("../ca-dealers-used.csv")
})
cat("CPU time usage of code chunk:", memused$Elapsed_Time_sec, "\n")
cat("Memory usage of code chunk:", memused$Peak_RAM_Used_MiB, "MiB\n")

```


```{r}
# DATA PREPROCESSING
# 1. FEATURE SELECTION
# 2. REMOVING MISSING AND NAN VALUES
# 3. CONVERTING COLUMNS WITH STRING VALUES INTO CATEGORICAL NUMERIC VALUES

memused<-peakRAM({

car<-subset(car,select=c(miles, year, make, model,trim,body_type, vehicle_type, drivetrain, transmission, fuel_type,engine_size, city,price))
car_viz<-car
#' Drop incomplete or non-finite rows and coerce every column to numeric.
#'
#' Rows with `NA`/`NaN` in any column are removed first, then rows with
#' `Inf`/`-Inf` in any numeric column. The infinite check runs column by
#' column instead of `apply()`-ing over rows: `apply()` converts the data
#' frame to a matrix, and once a single column is non-numeric that matrix is
#' character, so `is.infinite()` can no longer detect infinite values.
#'
#' @param df A data frame.
#' @return A data frame holding the surviving rows, with every column passed
#'   through `as.numeric()`.
clean_dataset <- function(df) {
  stopifnot(is.data.frame(df))
  df <- na.omit(df) # Drop rows with missing values (NA and NaN)

  # Accumulate, one column at a time, which rows contain Inf or -Inf.
  has_infinite <- rep(FALSE, nrow(df))
  for (values in df) {
    if (is.numeric(values)) {
      has_infinite <- has_infinite | is.infinite(values)
    }
  }
  df <- df[!has_infinite, , drop = FALSE]

  # Convert the remaining columns to numeric type
  as.data.frame(lapply(df, as.numeric))
}

# Label-encode the text columns: every character column of `car` is replaced
# by the numeric codes of its factor levels. Non-character columns are left
# untouched.
text_cols <- names(car)[vapply(car, is.character, logical(1))]
for (col in text_cols) {
  car[[col]] <- as.numeric(factor(car[[col]]))
}

car_training<-car[,1:13]

car_training2<-clean_dataset(car_training) 

})
cat("CPU time usage of code chunk:",memused$Elapsed_Time_sec,"\n")
cat("Memory usage of code chunk:",memused$Peak_RAM_Used_MiB, "MiB\n")

write.csv(car_training2, file = "my_data.csv")
```
```{r}
#BASIC EDA

# Histogram of listing prices, requesting 1000 breaks.
hist(car_viz$price, breaks = 1000, main = "Price", xlab = "Price range", ylab = "Count")
# Histogram of mileage; labelled as miles instead of reusing the price labels.
hist(car_viz$miles, main = "Miles", xlab = "Miles range", ylab = "Count")
```


```{r}
# Visualizing the data: number of listings per make, stacked by body type.
# Wall-clock time and the change in pryr::mem_used() are sampled around the
# plot expression.
start_time <- Sys.time()
mem_start <- mem_used()

ggplot(car_viz, aes(x = make)) +
  geom_bar(aes(fill = body_type)) +
  labs(x = "Make", y = "Count of Body Type", title = "Make vs Body Type") +
  theme(
    plot.title = element_text(size = 18),
    axis.text.x = element_text(angle = 65, hjust = 1),
    legend.key.size = unit(0.2, "cm"),
    legend.text = element_text(size = 5),
    legend.title = element_text(size = 5),
    legend.box = "vertical"
  )

end_time <- Sys.time()
mem_end <- mem_used()
# Pin the unit to seconds: subtracting two Sys.time() values lets difftime
# choose the unit automatically, and cat() prints only the bare number.
cpu_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
mem_used_chunk <- mem_end - mem_start
cat("CPU time usage of code chunk:", cpu_time, "\n")
# mem_used() is in bytes; divide by 1048576 (2^20) so the value matches the
# "MiB" label instead of reporting decimal megabytes under it.
cat("Memory usage of code chunk:", mem_used_chunk / 1048576, "MiB\n")

```

```{r}
# Interactive version of the make / body-type bar chart: build the ggplot,
# convert it with ggplotly(), and sample wall-clock time and the change in
# pryr::mem_used() around it.
start_time <- Sys.time()
mem_start <- mem_used()

plotinter <- ggplot(car_viz, aes(x = make)) +
  geom_bar(aes(fill = body_type)) +
  labs(x = "Make", y = "Count of Body Type", title = "Make vs Body Type") +
  theme(
    plot.title = element_text(size = 18),
    axis.text.x = element_text(angle = 65, hjust = 1),
    legend.key.size = unit(0.2, "cm"),
    legend.text = element_text(size = 5),
    legend.title = element_text(size = 5),
    legend.box = "vertical"
  )
ggplotly(plotinter)

end_time <- Sys.time()
mem_end <- mem_used()
# Pin the unit to seconds: subtracting two Sys.time() values lets difftime
# choose the unit automatically, and cat() prints only the bare number.
cpu_time <- as.numeric(difftime(end_time, start_time, units = "secs"))
mem_used_chunk <- mem_end - mem_start
cat("CPU time usage of code chunk:", cpu_time, "\n")
# Bytes divided by 1048576 (2^20) are mebibytes, so the label is MiB, not MB.
cat("Memory usage of code chunk:", mem_used_chunk / 1048576, "MiB\n")
```



```{r}
# ML model train/test split
# Columns 1-12 of the cleaned data are the predictors and column 13 is the
# response. createDataPartition() picks the training rows (p = 0.7) after the
# seed is fixed; the remaining rows form the test set.
memused <- peakRAM({
  X <- car_training2[, 1:12]
  Y <- car_training2[[13]]

  set.seed(42)
  splitIndex <- createDataPartition(Y, p = 0.7, list = FALSE, times = 1)

  x_train <- X[splitIndex, ]
  y_train <- Y[splitIndex]
  x_test <- X[-splitIndex, ]
  y_test <- Y[-splitIndex]
})
cat("CPU time usage of code chunk:", memused$Elapsed_Time_sec, "\n")
cat("Memory usage of code chunk:", memused$Peak_RAM_Used_MiB, "MiB\n")


```



```{r}
# ML linear regression: training and testing
# Fit a linear model through caret and predict the held-out rows, recording
# elapsed time and peak RAM for fit + predict.
memused <- peakRAM({

  # Resampling is switched off: train() otherwise refits the model on
  # bootstrap resamples whose performance estimates are never read in this
  # notebook, which inflates the measured time and memory compared with the
  # single fits timed in the KNN and random-forest chunks.
  lm_model <- train(
    x_train, y_train,
    method = "lm",
    trControl = trainControl(method = "none")
  )
  y_pred <- predict(lm_model, x_test)

})
cat("CPU time usage of code chunk:", memused$Elapsed_Time_sec, "\n")
cat("Memory usage of code chunk:", memused$Peak_RAM_Used_MiB, "MiB\n")

# Calculating metrics on the test set
mse <- mean((y_pred - y_test)^2)
mae <- mean(abs(y_pred - y_test))
rmse <- sqrt(mse)

cat("Linear Regression MSE: ", mse, "\n")
cat("Linear Regression MAE: ", mae, "\n")
cat("Linear Regression RMSE: ", rmse, "\n")

```

```{r}
# Persist the fitted linear model to disk, then print the size of the
# resulting .rds file as reported by file.info().
saveRDS(lm_model, "LinearReg.rds")
file_info <- file.info("LinearReg.rds")
file_size <- file_info[["size"]]
print(file_size)

```

```{r}
# KNN model
# Fit caret's knnreg() on the training split and predict the held-out rows,
# recording elapsed time and peak RAM for fit + predict.
memused <- peakRAM({
  knnmodel <- knnreg(x_train, y_train)
  y_pred <- predict(knnmodel, x_test)
})
cat("CPU time usage of code chunk:", memused$Elapsed_Time_sec, "\n")
cat("Memory usage of code chunk:", memused$Peak_RAM_Used_MiB, "MiB\n")

# Calculating metrics on the test set
mae <- mean(abs(y_pred - y_test))
mse <- mean((y_pred - y_test)^2)
rmse <- sqrt(mse)

cat("KNN MSE: ", mse, "\n")
cat("KNN MAE: ", mae, "\n")
cat("KNN RMSE: ", rmse, "\n")

```


```{r}
# Random forest
# Fit a 50-tree forest with randomForest() and predict the held-out rows,
# recording elapsed time and peak RAM for fit + predict.
memused <- peakRAM({
  # caret-based alternative, left disabled:
  # rf_model <- train(x_train, y_train, method = "rf")
  # y_pred <- predict(rf_model, x_test)

  rf_model <- randomForest(x = x_train, y = y_train, ntree = 50)

  # Predicting the test set results
  y_pred <- predict(rf_model, newdata = x_test)
})
cat("CPU time usage of code chunk:", memused$Elapsed_Time_sec, "\n")
cat("Memory usage of code chunk:", memused$Peak_RAM_Used_MiB, "MiB\n")

# Calculating metrics on the test set
mae <- mean(abs(y_pred - y_test))
mse <- mean((y_pred - y_test)^2)
rmse <- sqrt(mse)

cat("Random Forest MSE: ", mse, "\n")
cat("Random Forest MAE: ", mae, "\n")
cat("Random Forest RMSE: ", rmse, "\n")
```